# load the packages
suppressMessages({
  library(data.table)
  library(dplyr)
  library(tidyverse)
  library(psych)
  library(scales)
  library(lubridate)
  library(ggplot2) 
  library(hrbrthemes)
  library(plotly)
  library(forecast) 
  library(prophet) 
})

set.seed(42)

Data

calendar.csv - Contains information about the dates on which the products are sold. sell_prices.csv - Contains information about the price of the products sold per store and date. sales_train_validation.csv - Contains the historical daily unit sales data per product and store [d_1 - d_1913].

# read the data
calendar <- fread('data/calendar.csv')
price <- fread('data/sell_prices.csv')
sales <- fread('data/sales_train_validation.csv')
head(calendar)
##          date wm_yr_wk   weekday wday month year   d event_name_1
## 1: 2011-01-29    11101  Saturday    1     1 2011 d_1             
## 2: 2011-01-30    11101    Sunday    2     1 2011 d_2             
## 3: 2011-01-31    11101    Monday    3     1 2011 d_3             
## 4: 2011-02-01    11101   Tuesday    4     2 2011 d_4             
## 5: 2011-02-02    11101 Wednesday    5     2 2011 d_5             
## 6: 2011-02-03    11101  Thursday    6     2 2011 d_6             
##    event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI
## 1:                                              0       0       0
## 2:                                              0       0       0
## 3:                                              0       0       0
## 4:                                              1       1       0
## 5:                                              1       0       1
## 6:                                              1       1       1
head(price)
##    store_id       item_id wm_yr_wk sell_price
## 1:     CA_1 HOBBIES_1_001    11325       9.58
## 2:     CA_1 HOBBIES_1_001    11326       9.58
## 3:     CA_1 HOBBIES_1_001    11327       8.26
## 4:     CA_1 HOBBIES_1_001    11328       8.26
## 5:     CA_1 HOBBIES_1_001    11329       8.26
## 6:     CA_1 HOBBIES_1_001    11330       8.26
# train test split
basic <- select(sales, 1:6)
date <- select(sales, -1:-6)
date_num <- dim(date)[2]
test_size <- 28
train_size <- date_num - 28
train <- select(date, 1:all_of(train_size))
test <- select(date, (all_of(train_size)+1):all_of(date_num))
train <- cbind(basic, train)
test <- cbind(basic, test)

The data range from 2011-01-29 to 2016-06-19.

calendar$date <- as.Date(calendar$date, format='%Y-%m-%d')
calendar %>% summarize(min_date=min(date), max_date=max(date))
##     min_date   max_date
## 1 2011-01-29 2016-06-19

The data range for the training data set is from 2011-01-29 to 2016-03-27, and the data range for the test set is from 2016-03-28 to 2016-04-24.

train_date <- calendar$date[1:train_size]
test_date <- calendar$date[(train_size+1):(train_size+28)]
min(train_date)
## [1] "2011-01-29"
max(train_date)
## [1] "2016-03-27"
min(test_date)
## [1] "2016-03-28"
max(test_date)
## [1] "2016-04-24"

In our dataset, we have 3 categories of 3049 items in 7 departments. Those items were sold in 10 stores of 3 states.

# count unique item, dept, category, store, and state
length(unique(train[['item_id']]))
## [1] 3049
length(unique(train[['dept_id']]))
## [1] 7
length(unique(train[['cat_id']]))
## [1] 3
length(unique(train[['store_id']]))
## [1] 10
length(unique(train[['state_id']]))
## [1] 3
(top_item <- head(count(train, item_id, sort = TRUE), 5))
## # A tibble: 5 x 2
##   item_id         n
##   <chr>       <int>
## 1 FOODS_1_001    10
## 2 FOODS_1_002    10
## 3 FOODS_1_003    10
## 4 FOODS_1_004    10
## 5 FOODS_1_005    10
(top_dept <- count(train, dept_id, sort = TRUE))
## # A tibble: 7 x 2
##   dept_id         n
##   <chr>       <int>
## 1 FOODS_3      8230
## 2 HOUSEHOLD_1  5320
## 3 HOUSEHOLD_2  5150
## 4 HOBBIES_1    4160
## 5 FOODS_2      3980
## 6 FOODS_1      2160
## 7 HOBBIES_2    1490
(top_cat <- count(train, cat_id, sort = TRUE))
## # A tibble: 3 x 2
##   cat_id        n
##   <chr>     <int>
## 1 FOODS     14370
## 2 HOUSEHOLD 10470
## 3 HOBBIES    5650
(top_store <- count(train, store_id, sort = TRUE))
## # A tibble: 10 x 2
##    store_id     n
##    <chr>    <int>
##  1 CA_1      3049
##  2 CA_2      3049
##  3 CA_3      3049
##  4 CA_4      3049
##  5 TX_1      3049
##  6 TX_2      3049
##  7 TX_3      3049
##  8 WI_1      3049
##  9 WI_2      3049
## 10 WI_3      3049
(top_state <- count(train, state_id, sort = TRUE))
## # A tibble: 3 x 2
##   state_id     n
##   <chr>    <int>
## 1 CA       12196
## 2 TX        9147
## 3 WI        9147

FOODS category and FOODS_3 department have the largest number of products.

ggplot(train, aes(x=dept_id)) +
  geom_bar(fill='steelblue') + 
  ggtitle('Item Count by Department') +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(train, aes(x=cat_id)) +
  geom_bar(fill='steelblue') + 
  ggtitle('Item Count by Category') +
  theme(plot.title = element_text(hjust = 0.5))

There are 4 stores in CA, 3 in TX, and 3 in WI.

(store_state <- train %>% group_by(state_id) %>% summarize(unique_stores=n_distinct(store_id)))
## # A tibble: 3 x 2
##   state_id unique_stores
##   <chr>            <int>
## 1 CA                   4
## 2 TX                   3
## 3 WI                   3
ggplot(store_state, aes(state_id, unique_stores)) +
  geom_col(fill='steelblue') + 
  ggtitle('Number of Stores by State') +
  theme(plot.title = element_text(hjust = 0.5))

# read the data
train <- fread('data/train.csv')
test <- fread('data/test.csv')
head(train)
##    V1                            id       item_id   dept_id  cat_id
## 1:  1 HOBBIES_1_001_CA_1_validation HOBBIES_1_001 HOBBIES_1 HOBBIES
## 2:  2 HOBBIES_1_002_CA_1_validation HOBBIES_1_002 HOBBIES_1 HOBBIES
## 3:  3 HOBBIES_1_003_CA_1_validation HOBBIES_1_003 HOBBIES_1 HOBBIES
## 4:  4 HOBBIES_1_004_CA_1_validation HOBBIES_1_004 HOBBIES_1 HOBBIES
## 5:  5 HOBBIES_1_005_CA_1_validation HOBBIES_1_005 HOBBIES_1 HOBBIES
## 6:  6 HOBBIES_1_006_CA_1_validation HOBBIES_1_006 HOBBIES_1 HOBBIES
##    store_id state_id   d sales       date wm_yr_wk  weekday wday month
## 1:     CA_1       CA d_1     0 2011-01-29    11101 Saturday    1     1
## 2:     CA_1       CA d_1     0 2011-01-29    11101 Saturday    1     1
## 3:     CA_1       CA d_1     0 2011-01-29    11101 Saturday    1     1
## 4:     CA_1       CA d_1     0 2011-01-29    11101 Saturday    1     1
## 5:     CA_1       CA d_1     0 2011-01-29    11101 Saturday    1     1
## 6:     CA_1       CA d_1     0 2011-01-29    11101 Saturday    1     1
##    year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA
## 1: 2011                                                           0
## 2: 2011                                                           0
## 3: 2011                                                           0
## 4: 2011                                                           0
## 5: 2011                                                           0
## 6: 2011                                                           0
##    snap_TX snap_WI sell_price
## 1:       0       0         NA
## 2:       0       0         NA
## 3:       0       0         NA
## 4:       0       0         NA
## 5:       0       0         NA
## 6:       0       0         NA
head(test)
##    V1                            id       item_id   dept_id  cat_id
## 1:  1 HOBBIES_1_001_CA_1_validation HOBBIES_1_001 HOBBIES_1 HOBBIES
## 2:  2 HOBBIES_1_002_CA_1_validation HOBBIES_1_002 HOBBIES_1 HOBBIES
## 3:  3 HOBBIES_1_003_CA_1_validation HOBBIES_1_003 HOBBIES_1 HOBBIES
## 4:  4 HOBBIES_1_004_CA_1_validation HOBBIES_1_004 HOBBIES_1 HOBBIES
## 5:  5 HOBBIES_1_005_CA_1_validation HOBBIES_1_005 HOBBIES_1 HOBBIES
## 6:  6 HOBBIES_1_006_CA_1_validation HOBBIES_1_006 HOBBIES_1 HOBBIES
##    store_id state_id      d sales       date wm_yr_wk weekday wday month
## 1:     CA_1       CA d_1886     1 2016-03-28    11609  Monday    3     3
## 2:     CA_1       CA d_1886     1 2016-03-28    11609  Monday    3     3
## 3:     CA_1       CA d_1886     0 2016-03-28    11609  Monday    3     3
## 4:     CA_1       CA d_1886     0 2016-03-28    11609  Monday    3     3
## 5:     CA_1       CA d_1886     1 2016-03-28    11609  Monday    3     3
## 6:     CA_1       CA d_1886     0 2016-03-28    11609  Monday    3     3
##    year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA
## 1: 2016           NA           NA           NA           NA       0
## 2: 2016           NA           NA           NA           NA       0
## 3: 2016           NA           NA           NA           NA       0
## 4: 2016           NA           NA           NA           NA       0
## 5: 2016           NA           NA           NA           NA       0
## 6: 2016           NA           NA           NA           NA       0
##    snap_TX snap_WI sell_price
## 1:       0       0       8.26
## 2:       0       0       3.97
## 3:       0       0       2.97
## 4:       0       0       4.64
## 5:       0       0       2.88
## 6:       0       0       0.96

Sales

General Sales

We can see an upward trend from 2011 to 2016. We can also observe seasonality with annual peaks around September and October, and a dip on Christmas every year when all stores closed.

daily_sales_df <- train %>% group_by(date) %>% summarize(daily_sales=sum(sales))
daily_sales_df$date <- as.Date(daily_sales_df$date)
head(daily_sales_df)
## # A tibble: 6 x 2
##   date       daily_sales
##   <date>           <int>
## 1 2011-01-29       32631
## 2 2011-01-30       31749
## 3 2011-01-31       23783
## 4 2011-02-01       25412
## 5 2011-02-02       19146
## 6 2011-02-03       29211
fig <- daily_sales_df %>%
  ggplot(aes(x=date, y=daily_sales, group=1)) +
  geom_area(fill='#69b3a2', alpha=0.5) +
  geom_line(color='#69b3a2') +
  geom_smooth(method='lm', formula=y~x, se=FALSE, size=0.5, color='gray20') +
  labs(x='Date', y='Sales', title='Daily Sales') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Sales by Item

There are 3049 distinct items in the dataset. We will randomly selected 20 items to plot here.

set.seed(42)
random_items <- sample(unique(train$item_id), 20)

item_sales_df <- train %>% 
  filter(item_id %in% random_items) %>%
  group_by(date, item_id) %>% 
  summarize(item_sales=sum(sales))

item_sales_df$date <- as.Date(item_sales_df$date)

fig <- item_sales_df %>%
  ggplot(aes(x=date, y=item_sales, col=item_id)) +
  geom_line() +
  labs(x='Date', y='Sales', title='Daily Sales by Item') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Sales by Department

dept_sales_df <- train %>% group_by(date, dept_id) %>% summarize(dept_sales=sum(sales))
dept_sales_df$date <- as.Date(dept_sales_df$date)

fig <- dept_sales_df %>%
  ggplot(aes(x=date, y=dept_sales, col=dept_id)) +
  geom_line() +
  labs(x='Date', y='Sales', title='Daily Sales by Department') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Sales by Category

cat_sales_df <- train %>% group_by(date, cat_id) %>% summarize(cat_sales=sum(sales))
cat_sales_df$date <- as.Date(cat_sales_df$date)

fig <- cat_sales_df %>%
  ggplot(aes(x=date, y=cat_sales, col=cat_id)) +
  geom_line() +
  labs(x='Date', y='Sales', title='Daily Sales by Product Category') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Sales by Store

store_sales_df <- train %>% group_by(date, store_id) %>% summarize(store_sales=sum(sales))
store_sales_df$date <- as.Date(store_sales_df$date)

fig <- store_sales_df %>%
  ggplot(aes(x=date, y=store_sales, col=store_id)) +
  geom_line() +
  labs(x='Date', y='Sales', title='Daily Sales by Store') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Sales by State

state_sales_df <- train %>% group_by(date, state_id) %>% summarize(state_sales=sum(sales))
state_sales_df$date <- as.Date(state_sales_df$date)

fig <- state_sales_df %>%
  ggplot(aes(x=date, y=state_sales, col=state_id)) +
  geom_line() +
  labs(x='Date', y='Sales', title='Daily Sales by State') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Price

Price by Item

There are 3049 distinct items in the dataset. We will randomly selected 20 items to plot here.

set.seed(42)
random_items <- sample(unique(train$item_id), 20)

item_price_df <- train %>% 
  filter(item_id %in% random_items) %>%
  group_by(date, item_id) %>% 
  summarize(item_price=mean(sell_price))

item_price_df$date <- as.Date(item_price_df$date)

fig <- item_price_df %>%
  ggplot(aes(x=date, y=item_price, col=item_id)) +
  geom_line() +
  labs(x='Date', y='Price', title='Item Daily Average Price') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Price by Department

dept_price_df <- train %>% group_by(date, dept_id) %>% summarize(dept_price=mean(sell_price, 
na.rm=TRUE))

dept_price_df$date <- as.Date(dept_price_df$date)

fig <- dept_price_df %>%
  ggplot(aes(x=date, y=dept_price, col=dept_id)) +
  geom_line() +
  labs(x='Date', y='Price', title='Department Daily Average Price') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Price by Category

cat_price_df <- train %>% group_by(date, cat_id) %>% summarize(cat_price=mean(sell_price, 
na.rm=TRUE))

cat_price_df$date <- as.Date(cat_price_df$date)

fig <- cat_price_df %>%
  ggplot(aes(x=date, y=cat_price, col=cat_id)) +
  geom_line() +
  labs(x='Date', y='Price', title='Daily Average Price by Category') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Price by Store

store_price_df <- train %>% group_by(date, store_id) %>% summarize(store_price=mean(sell_price, 
na.rm=TRUE))

store_price_df$date <- as.Date(store_price_df$date)

fig <- store_price_df %>%
  ggplot(aes(x=date, y=store_price, col=store_id)) +
  geom_line() +
  labs(x='Date', y='Price', title='Daily Average Price by Store') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)

Price by State

state_price_df <- train %>% group_by(date, state_id) %>% summarize(state_price=mean(sell_price, 
na.rm=TRUE))

state_price_df$date <- as.Date(state_price_df$date)

fig <- state_price_df %>%
  ggplot(aes(x=date, y=state_price, col=state_id)) +
  geom_line() +
  labs(x='Date', y='Price', title='Daily Average Price by State') +
  scale_x_date(date_breaks='3 month', date_labels='%b %y') +
  theme_ipsum() +
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplotly(fig, dynamicTicks=TRUE)